# Set the knitr chunk default so every chunk's source code is echoed.
knitr::opts_chunk$set(echo = TRUE)
# install.packages("plotly")
# install.packages("tidyverse")
# install.packages("styler")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(styler)
# Load the cleaned gapminder data as a tibble. The CSV's first column has no
# header, so readr assigns it the name `...1` on import.
gapminder <- as_tibble(read_csv("gapminder_clean.csv"))
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Subset to 1962 and scatter CO2 emissions against GDP per capita. The axes
# are flipped, so GDP per capita ends up on the horizontal axis.
gapminder1962 <- filter(gapminder, Year == 1962)
ggplot(
  data = gapminder1962,
  mapping = aes(x = `CO2 emissions (metric tons per capita)`, y = gdpPercap)
) +
  geom_point() +
  labs(y = "GDP per capita ($)") +
  coord_flip()
## Warning: Removed 151 rows containing missing values (geom_point).
# Correlation test: Pearson correlation between CO2 emissions and GDP per
# capita in 1962.
# cor.test() drops incomplete pairs on its own and has no `use` argument (that
# one belongs to cor()), so the stray `use = "complete.obs"` has been removed.
gapminder1962_cor <- cor.test(
  gapminder1962$`CO2 emissions (metric tons per capita)`,
  gapminder1962$gdpPercap
)
# Round for reporting: 3 significant digits for p, 2 decimals for r.
gapminder1962_p <- signif(gapminder1962_cor$p.value, 3)
gapminder1962_c <- round(gapminder1962_cor$estimate, 2)
pearson_output <- paste0("Pearson correlation: ", gapminder1962_c, "\n")
pvalue_output <- paste0("P-value: ", gapminder1962_p)
Pearson correlation: 0.93
P-value: 1.13e-46
# Find strongest correlation: compute the CO2 / GDP-per-capita Pearson
# correlation separately for every year present in the data and print each.
years <- unique(gapminder$Year)
# Preallocate the result instead of growing it with c() on every iteration.
coors <- numeric(length(years))
for (i in seq_along(years)) {
  y <- years[[i]]
  gapminderyear <- filter(gapminder, Year == y)
  # cor.test() removes incomplete pairs itself; it takes no `use` argument.
  gap_cor <- round(
    cor.test(
      gapminderyear$`CO2 emissions (metric tons per capita)`,
      gapminderyear$gdpPercap
    )$estimate,
    3
  )
  cat(paste0(y, " pearson correlation: ", gap_cor, "\n"))
  # coors[i] lines up with years[i], which later code relies on.
  coors[i] <- gap_cor
}
## 1962 pearson correlation: 0.926
## 1967 pearson correlation: 0.939
## 1972 pearson correlation: 0.843
## 1977 pearson correlation: 0.793
## 1982 pearson correlation: 0.817
## 1987 pearson correlation: 0.81
## 1992 pearson correlation: 0.809
## 1997 pearson correlation: 0.808
## 2002 pearson correlation: 0.801
## 2007 pearson correlation: 0.72
# Year with the largest correlation. which.max() always yields a single index
# (the first maximum), so tied rounded correlations cannot produce a vector of
# several years that would then be silently recycled in `Year == max_year`.
max_year <- years[which.max(coors)]
max_year_output <- paste0("\n", "Correlation is strongest in ", max_year)
Correlation is strongest in 1967
# Interactive scatter for the strongest-correlation year: point size encodes
# population and colour encodes continent; axes are flipped.
plot1 <- ggplot(
  data = filter(gapminder, Year == max_year),
  mapping = aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap,
    size = pop,
    color = continent
  )
) +
  geom_point() +
  labs(y = "GDP per capita ($)") +
  coord_flip()
ggplotly(plot1)
# Filter out NAs: keep only rows that have both an energy-use value and a
# continent.
# The column is renamed first so it can be referenced as a bare name. Calling
# is.na() on the quoted column name tests a string literal, which is never NA,
# so such a filter removes nothing.
gapminder2 <- gapminder %>%
  rename(energy = `Energy use (kg of oil equivalent per capita)`) %>%
  filter(!is.na(energy), !is.na(continent))
# One-way ANOVA of energy use by continent (suitable for a quantitative
# response split over more than two categories). Only the summary is kept.
gapminder_aov <- summary(
  aov(energy ~ continent, data = gapminder2, na.action = na.omit)
)
# gapminder_aov[[1]]
# p-value of the continent term: first entry of the `Pr(>F)` column.
gapminder_aovP <- gapminder_aov[[1]][["Pr(>F)"]][1]
# Plot boxplots: distribution of energy use within each continent.
gapminder2 %>%
  ggplot(aes(x = continent, y = energy)) +
  # geom_violin() +
  geom_boxplot() +
  labs(y = "Energy use (kg of oil equivalent per capita)")
## Warning: Removed 436 rows containing non-finite values (stat_boxplot).
# answer: fixed sentence followed by the ANOVA p-value, separated by a space.
energy_output <- paste(
  paste0(
    "A: There are big differences in energy use across the 5 continents. ",
    "This can be seen in the boxplots and is highly significant by ANOVA ",
    "(appropriate for quantitative data organized in over 2 categories). ",
    "p-value:"
  ),
  gapminder_aovP
)
A: There are big differences in energy use across the 5 continents. This can be seen in the boxplots and is highly significant by ANOVA (appropriate for quantitative data organized in over 2 categories). p-value: 8.52700348715528e-39
# Keep observations after 1990 that belong to Asia or Europe.
gapminder1990 <- gapminder %>%
  filter(Year > 1990, continent %in% c("Asia", "Europe"))
# Run t-test (appropriate to compare between two categories of quantitative
# data): imports as a share of GDP, Asia versus Europe.
test_output <- t.test(
  filter(gapminder1990, continent == "Asia")$`Imports of goods and services (% of GDP)`,
  filter(gapminder1990, continent == "Europe")$`Imports of goods and services (% of GDP)`
)
# Boxplot of the same comparison.
gapminder1990 %>%
  ggplot(aes(x = continent, y = `Imports of goods and services (% of GDP)`)) +
  geom_boxplot()
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
# Answer text: fixed sentence followed by the t-test p-value, separated by a
# space.
output <- paste(
  paste0(
    "A: There is not a significant difference between the two, ",
    "as seen in the boxplot and quantified by t.test ",
    "(appropriate to compare between two categories of quantitative data). ",
    "p-value:"
  ),
  test_output$p.value
)
A: There is not a significant difference between the two, as seen in the boxplot and quantified by t.test (appropriate to compare between two categories of quantitative data). p-value: 0.177569118980769
### Average for each country ###
# Mean population density per country over all years, sorted highest first.
# Plain summarise() replaces the superseded summarise_at(); the result has the
# same two columns (`Country Name`, avg_density).
# NOTE(review): mean() is called without na.rm, so a country with any missing
# year gets NA (and arrange() places it last) — confirm this is intended.
gapminder_countries <- gapminder %>%
  group_by(`Country Name`) %>%
  summarise(
    avg_density = mean(`Population density (people per sq. km of land area)`)
  ) %>%
  arrange(desc(avg_density))
# print head and plot
# gapminder_countries %>% head()
# One bar per country; the country labels on the x axis are hidden.
plot1 <- ggplot(gapminder_countries, aes(x = `Country Name`, y = avg_density)) +
  geom_col() +
  theme(axis.text.x = element_blank()) +
  xlab("Countries")
ggplotly(plot1)
## Warning: Removed 9 rows containing missing values (position_stack).
### Rank across each country and plot ##
# Rank countries by population density within each year (rank 1 = densest),
# then average every country's rank over the years, best average rank first.
# A grouped mutate() replaces the former per-year loop, which grew a data
# frame with rbind() on every iteration and called head() with no effect.
gapminder_countries_rank <- gapminder %>%
  # Rows without a year were never ranked by the per-year loop either.
  filter(!is.na(Year)) %>%
  group_by(Year) %>%
  mutate(rank = rank(-`Population density (people per sq. km of land area)`)) %>%
  # Average for each country
  group_by(`Country Name`) %>%
  summarise(avg_density_rank = mean(rank)) %>%
  arrange(avg_density_rank)
# print head
# gapminder_countries_rank %>% head()
# Answer text: the top country by average density, then the top two countries
# by average yearly density rank.
output <- paste0(
  "A: The highest average across all years is ",
  gapminder_countries$`Country Name`[1],
  ". However, the highest average RANKING (rank 1-263, averaged across ",
  "the 10 years of data, if they exist for the country) is a tie between ",
  "these two countries: ",
  gapminder_countries_rank$`Country Name`[1],
  " and ",
  gapminder_countries_rank$`Country Name`[2]
)
A: The highest average across all years is Macao SAR, China. However, the highest average RANKING (rank 1-263, averaged across the 10 years of data, if they exist for the country) is a tie between these two countries: Macao SAR, China and Monaco
# List countries, set baseline in 1962
countries <- unique(gapminder$`Country Name`)
# 1962 rows that actually have a life-expectancy value (the baseline table).
gapminder1962 <- filter(
  gapminder,
  Year == 1962,
  !is.na(`Life expectancy at birth, total (years)`)
)
# Make new dataframe with life exp vs 1962: every row's life expectancy minus
# its country's 1962 value. match() picks the first baseline row per country
# and yields NA for countries without one, so those rows get NA.
# This vectorised lookup replaces a per-country loop that grew the result with
# rbind() and used `c` as its loop variable.
gapminder_exp <- gapminder %>%
  # Rows with a missing country name cannot be matched to a baseline.
  filter(!is.na(`Country Name`)) %>%
  mutate(
    Life_expectancy_vs_1962 = `Life expectancy at birth, total (years)` -
      gapminder1962$`Life expectancy at birth, total (years)`[
        match(`Country Name`, gapminder1962$`Country Name`)
      ]
  ) %>%
  # Keep rows grouped by country in order of first appearance.
  arrange(match(`Country Name`, countries))
# Plot: one line per country showing the change in life expectancy relative
# to 1962, rendered interactively.
plot1 <- ggplot(
  data = gapminder_exp,
  mapping = aes(x = Year, y = Life_expectancy_vs_1962, color = `Country Name`)
) +
  geom_line() +
  labs(y = "Life expectancy change since 1962 (years)")
ggplotly(plot1)
# Extract top value: sort by the gain since 1962, largest first, and report
# the country in the first row.
gapminder_exp_top <- arrange(gapminder_exp, desc(Life_expectancy_vs_1962))
output <- paste(
  gapminder_exp_top$`Country Name`[1],
  "has shown the greatest increase in life expectancy."
)
Maldives has shown the greatest increase in life expectancy.